{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ubuntu/anaconda3/lib/python3.6/site-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py:1067: UserWarning: Duplicate key in file \"/home/ubuntu/.config/matplotlib/matplotlibrc\", line #2\n",
      "  (fname, cnt))\n",
      "/home/ubuntu/anaconda3/lib/python3.6/site-packages/matplotlib/__init__.py:1067: UserWarning: Duplicate key in file \"/home/ubuntu/.config/matplotlib/matplotlibrc\", line #3\n",
      "  (fname, cnt))\n"
     ]
    }
   ],
   "source": [
    "from __future__ import absolute_import\n",
    "from __future__ import division\n",
    "from __future__ import print_function\n",
    "\n",
    "import collections\n",
    "import os\n",
    "from bert import modeling\n",
    "from bert import optimization\n",
    "from bert import tokenization\n",
    "import tensorflow as tf\n",
    "from sklearn.metrics import f1_score,precision_score,recall_score\n",
    "from tensorflow.python.ops import math_ops\n",
    "import tf_metrics\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "class InputExample(object):\n",
    "    \"\"\"A single training/test example for simple sequence classification.\"\"\"\n",
    "\n",
    "    def __init__(self, guid, text, label=None):\n",
    "        \"\"\"Constructs a InputExample.\n",
    "\n",
    "        Args:\n",
    "          guid: Unique id for the example.\n",
    "          text_a: string. The untokenized text of the first sequence. For single\n",
    "            sequence tasks, only this sequence must be specified.\n",
    "          label: (Optional) string. The label of the example. This should be\n",
    "            specified for train and dev examples, but not for test examples.\n",
    "        \"\"\"\n",
    "        self.guid = guid\n",
    "        self.text = text\n",
    "        self.label = label\n",
    "\n",
    "\n",
    "class InputFeatures(object):\n",
    "    \"\"\"A single set of features of data.\"\"\"\n",
    "\n",
    "    def __init__(self, input_ids, input_mask, segment_ids, label_ids,):\n",
    "        self.input_ids = input_ids\n",
    "        self.input_mask = input_mask\n",
    "        self.segment_ids = segment_ids\n",
    "        self.label_ids = label_ids\n",
    "        #self.label_mask = label_mask\n",
    "\n",
    "\n",
    "class DataProcessor(object):\n",
    "    \"\"\"Base class for data converters for sequence classification data sets.\"\"\"\n",
    "\n",
    "    def get_train_examples(self, data_dir):\n",
    "        \"\"\"Gets a collection of `InputExample`s for the train set.\"\"\"\n",
    "        raise NotImplementedError()\n",
    "\n",
    "    def get_dev_examples(self, data_dir):\n",
    "        \"\"\"Gets a collection of `InputExample`s for the dev set.\"\"\"\n",
    "        raise NotImplementedError()\n",
    "\n",
    "    def get_labels(self):\n",
    "        \"\"\"Gets the list of labels for this data set.\"\"\"\n",
    "        raise NotImplementedError()\n",
    "\n",
    "    @classmethod\n",
    "    def _read_data(cls, input_file):\n",
    "        \"\"\"Reads a BIO data.\"\"\"\n",
    "        with open(input_file) as f:\n",
    "            lines = []\n",
    "            words = []\n",
    "            labels = []\n",
    "            for line in f:\n",
    "                contends = line.strip()\n",
    "                word = line.strip().split(' ')[0]\n",
    "                label = line.strip().split(' ')[-1]\n",
    "                if contends.startswith(\"-DOCSTART-\"):\n",
    "                    words.append('')\n",
    "                    continue\n",
    "                if len(contends) == 0 and words[-1] == '.':\n",
    "                    l = ' '.join([label for label in labels if len(label) > 0])\n",
    "                    w = ' '.join([word for word in words if len(word) > 0])\n",
    "                    lines.append([l, w])\n",
    "                    words = []\n",
    "                    labels = []\n",
    "                    continue\n",
    "                words.append(word)\n",
    "                labels.append(label)\n",
    "            return lines\n",
    "\n",
    "class NerProcessor(DataProcessor):\n",
    "    def get_train_examples(self, data_dir):\n",
    "        return self._create_example(\n",
    "            self._read_data(os.path.join(data_dir, \"train.txt\")), \"train\"\n",
    "        )\n",
    "\n",
    "    def get_dev_examples(self, data_dir):\n",
    "        return self._create_example(\n",
    "            self._read_data(os.path.join(data_dir, \"dev.txt\")), \"dev\"\n",
    "        )\n",
    "\n",
    "    def get_test_examples(self,data_dir):\n",
    "        return self._create_example(\n",
    "            self._read_data(os.path.join(data_dir, \"test.txt\")), \"test\")\n",
    "\n",
    "\n",
    "    def get_labels(self):\n",
    "        return [\"B-MISC\", \"I-MISC\", \"O\", \"B-PER\", \"I-PER\", \"B-ORG\", \"I-ORG\", \"B-LOC\", \"I-LOC\", \"X\",\"[CLS]\",\"[SEP]\"]\n",
    "\n",
    "    def _create_example(self, lines, set_type):\n",
    "        examples = []\n",
    "        for (i, line) in enumerate(lines):\n",
    "            guid = \"%s-%s\" % (set_type, i)\n",
    "            text = tokenization.convert_to_unicode(line[1])\n",
    "            label = tokenization.convert_to_unicode(line[0])\n",
    "            examples.append(InputExample(guid=guid, text=text, label=label))\n",
    "        return examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "processors = {\n",
    "    \"ner\": NerProcessor\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "processor = processors['ner']()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<__main__.NerProcessor at 0x7f19434904e0>"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "processor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "label_list = processor.get_labels()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_examples = processor.get_train_examples('HironsanData')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[]"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data_dir = './HironsanData'\n",
    "processor._read_data(os.path.join(data_dir, 'train.txt'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "path= os.path.join(data_dir, 'train.txt')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(path) as f:\n",
    "    lines = []\n",
    "    words = []\n",
    "    labels = []\n",
    "    labels_all = []\n",
    "    for line in f:\n",
    "        contends = line.strip()\n",
    "        word = line.strip().split('\\t')[0]\n",
    "        label = line.strip().split('\\t')[-1]\n",
    "        if contends.startswith(\"-DOCSTART-\"):\n",
    "            words.append('')\n",
    "            continue\n",
    "        if len(contends) == 0 and words[-1] == '。':\n",
    "            l = ' '.join([label for label in labels if len(label) > 0])\n",
    "            w = ' '.join([word for word in words if len(word) > 0])\n",
    "            lines.append([l, w])\n",
    "            words = []\n",
    "            labels = []\n",
    "            continue\n",
    "        words.append(word)\n",
    "        labels.append(label)\n",
    "        labels_all.extend(labels)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 34,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['I-ORG',\n",
       " 'B-DAT',\n",
       " 'B-ORG',\n",
       " 'I-ART',\n",
       " 'I-MNY',\n",
       " 'B-TIM',\n",
       " 'I-PNT',\n",
       " 'I-PSN',\n",
       " 'I-DAT',\n",
       " 'O',\n",
       " 'B-PNT',\n",
       " 'B-PSN',\n",
       " 'I-LOC',\n",
       " 'B-MNY',\n",
       " 'B-LOC',\n",
       " 'B-ART',\n",
       " 'I-TIM']"
      ]
     },
     "execution_count": 34,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(set(labels_all))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
